Smoothing models

Get data - smooth disturbed and control plots together - AGB growth

Code
library(mgcv); library(data.table); library(gratia); library(marginaleffects); library(ggplot2); library(dplyr); library(tidyr); library(zoo); library(slider); library(here);library(broom); library(readr);library(lme4)



data=read_tsv("G:/My Drive/cesab bioforest/Data/data.tsv")

sites=c("Corinto" ,"Jari" , "Lesong" ,"Mbaiki","Misiones", "Paracou",      
"SUAS" ,"Sungai Lalang", "Ulu Muda")

setDT(data)

unique_plots <- data[site %in% sites, .(plots = unique(plot)), by = site]

setnames(data, "plot", "subplot")

data[, plot := gsub("_.*$", "", subplot)]


load("../clim_by_site_total.rda")


plots_df <- do.call(rbind, lapply(names(clim_by_site), function(site_name) {
  data.frame(
    site = site_name,
    plot = unlist(lapply(clim_by_site[[site_name]], function(block) block$plot)),
    stringsAsFactors = FALSE
  )
}))

census_anom_all_sites<- list()

all_sites_results <- list()

for (s in sites) {

  cat("Processing", s, "\n")

  ## ---- subset obs

  obs_site <- data[site == s]
  
  #obs_site2=obs_site[!duplicated(obs_site)]
  
  obs_site2 <- obs_site[rel_year >= 0]


  obs_site_wide <- dcast(
  obs_site2,
  treatment+site + subplot + plot+ year ~ variable,
  value.var = "value"
)


  ## ---- climate stack
  clim_all <- rbindlist(
    lapply(clim_by_site[[s]], function(x) x$climate[[1]]),
    idcol = "plot"
  )

  ## ---- baseline
  baseline <- clim_all[year >= 1981 & year <= 2010]

  clim_stats <- baseline[, .(
    mean_tmax = mean(tmax),
    sd_tmax   = sd(tmax),
    mean_vpd  = mean(vpd),
    sd_vpd    = sd(vpd),
    mean_srad = mean(srad),
    sd_srad   = sd(srad)
  ), by = month]

  clim_all <- merge(clim_all, clim_stats, by = "month", all.x = TRUE)

  ## ---- standardize
  clim_all[, `:=`(
    z_tmax = (tmax - mean_tmax) / sd_tmax,
    z_vpd  = (vpd  - mean_vpd)  / sd_vpd,
    z_srad = (srad - mean_srad) / sd_srad
  )]

  ## ---- match census years - needs to be by plot, as many sites dont have census for every plot every year

  inv_years <- obs_site_wide[, .(year = sort(unique(year))), by = plot]


  clim_all[
    inv_years,
    year_bin := i.year,
    on = .(plot, year),
    roll = TRUE
  ]

  ## ---- aggregate per census
  census_anom <- clim_all[, .(
    mean_z_tmax = mean(z_tmax, na.rm = TRUE),
    mean_z_vpd  = mean(z_vpd,  na.rm = TRUE),
    mean_z_srad = mean(z_srad, na.rm = TRUE)
    ), by = .(plot, year_bin)]#Because I group by year_bin, all pre-first-census months collapse into NA

  ## ---- merge obs + climate
  merged_site <- merge(
  obs_site_wide, 
    census_anom,
    by.x = c("year","plot"),
    by.y = c("year_bin","plot"),
    all.x = TRUE
  )

  ## ---- store
  census_anom_all_sites[[s]] <- census_anom

  all_sites_results[[s]] <- merged_site
}
Processing Corinto 
Processing Jari 
Processing Lesong 
Processing Mbaiki 
Processing Misiones 
Processing Paracou 
Processing SUAS 
Processing Sungai Lalang 
Processing Ulu Muda 
Code
merged_census_anom <- rbindlist(census_anom_all_sites, fill = TRUE, idcol = "site")
merged <- rbindlist(all_sites_results, fill = TRUE)

inv_years_plot <- merged[
  , .(
    n_years = uniqueN(year),
    treatment = unique(treatment)
  ),
  by = .(site, plot)
]


filtered_data <- merged %>%
  # 1. keep only plots with >= 7 inventory years
  group_by(site, plot) %>%
  filter(n_distinct(year) >= 7) %>%
  ungroup() 

unique(filtered_data$site)
[1] "Corinto"       "Jari"          "Lesong"        "Mbaiki"       
[5] "Misiones"      "Paracou"       "SUAS"          "Sungai Lalang"
[9] "Ulu Muda"     
Code
filtered_data %>%
  distinct(site, plot, treatment) %>%
  count(site, treatment)
# A tibble: 18 × 3
   site          treatment     n
   <chr>         <chr>     <int>
 1 Corinto       control       3
 2 Corinto       logged        6
 3 Jari          control       3
 4 Jari          logged       35
 5 Lesong        control       4
 6 Lesong        logged        8
 7 Mbaiki        control       3
 8 Mbaiki        logged        7
 9 Misiones      control       4
10 Misiones      logged       14
11 Paracou       control       6
12 Paracou       logged        9
13 SUAS          control       4
14 SUAS          logged       16
15 Sungai Lalang control       3
16 Sungai Lalang logged        6
17 Ulu Muda      control       3
18 Ulu Muda      logged        6
Code
setDT(filtered_data)

inv_years_plot <- filtered_data[
  , .(
    n_years = uniqueN(year),
    treatment = unique(treatment)
  ),
  by = .(site, plot)
]

inventories2=filtered_data[!is.na(gini),.(n_years=uniqueN(year)),
  by=.(site, plot)]

#nok for agb_mort

Visualize climate anomalies - Tmax per site

Code
site_order <- c(
  "Paracou", "Corinto", "Jari", "Misiones",
  "Mbaiki",
  "Ulu Muda", "Sungai Lalang", "Lesong", "SUAS"
)

merged_census_anom[, site := factor(site, levels = site_order)]

plot_census_anom <- merged_census_anom[
  !is.na(year_bin),
  .(mean_z_tmax = mean(mean_z_tmax, na.rm = TRUE)),  # collapse multiple values
  by = .(site, year_bin)
]

p=ggplot(plot_census_anom, aes(x = year_bin, y = mean_z_tmax)) +
  geom_point(aes(color = mean_z_tmax >= 0), size = 2) +
  geom_hline(yintercept = 0, linetype = "dashed") +
    geom_segment(
    aes(
      x = year_bin,
      xend = year_bin,
      y = 0,
      yend = mean_z_tmax
    ),
    color = "grey50"
  ) +
  facet_wrap(~ site, scales = "free_y") +  # separate y-axis per site
  #scale_x_continuous(breaks = sort(unique(census_anom$year_bin))) +
  scale_color_manual(values = c("TRUE" = "red", "FALSE" = "blue"), 
                     labels = c("Negative", "Positive")) +
  labs(x = "Year", y = "Tmax anomaly", color = "Sign") +
  theme_minimal()
p

GAM flexible k for min 7 inventories, with and without random effects

**

Code
merged=filtered_data[, subplot := as.factor(subplot)]

merged_sites_results <- list()


for (s in sites) {
  merged_site <- merged[site == s]

merged2 <- merged_site %>%
  group_by(subplot) %>%
  mutate(
    fitted_GAM = {
      df <- cur_data()
      df_nonNA <- df[!is.na(df$agb_growth), ]
      
      if (uniqueN(year) <= 20) {  
          k_use <- 3
          predict(
          gam(agb_growth ~ s(year, k = k_use), data = cur_data()),
          newdata = cur_data()
        )
      } else {
        k_use <- 5
          predict(
          gam(agb_growth ~ s(year, k = k_use), data = cur_data()),
          newdata = cur_data())
      }
    }
  ) %>%
  ungroup()

merged3 <- merged_site %>%
  mutate(
    fitted_GAMre = {
      df <- cur_data()
      df_nonNA <- df[!is.na(df$agb_growth), ]
      
      if (uniqueN(year) <= 20) {  
      predict(
      gam(agb_growth ~ s(year, k = 3)+
            s(subplot, bs="re")+
            s(subplot, year, bs="re"), data = cur_data()),
      newdata = cur_data()
    )
      } else {
      predict(
      gam(agb_growth ~ s(year, k = 5)+
            s(subplot, bs="re")+
            s(subplot, year, bs="re"), data = cur_data()),
      newdata = cur_data())
      }
    }
  ) 

  merged_combined <- cbind(merged2, fitted_GAMre = merged3$fitted_GAMre)



  ## ---- store
  merged_sites_results[[s]] <- merged_combined
}


merged_results <- rbindlist(merged_sites_results, fill = TRUE)

merged_results[
  , .(n_rows = .N,
      n_nonNA_fitted = sum(!is.na(fitted_GAM))),
  by = site
]
            site n_rows n_nonNA_fitted
          <char>  <int>          <int>
1:       Corinto     96             96
2:          Jari    266            266
3:        Lesong    144            144
4:        Mbaiki   1240           1240
5:      Misiones    126            126
6:       Paracou   1508           1508
7:          SUAS    260            260
8: Sungai Lalang     99             99
9:      Ulu Muda    108            108
Code
merged_results=merged_results[!is.na(fitted_GAM)]

p_all_sites <- ggplot(merged_results, aes(x = year, y = agb_growth - fitted_GAM)) +
  geom_point(alpha = 0.6) +
  facet_wrap(~ site, scales = "free_y") +  # separate y-axis per site
  ggtitle("GAM residuals per site") +
  xlab("Year") +
  ylab("Residuals (agb_growth - fitted_GAM)") +
  theme_minimal()

p_all_sites

Code
#Residuals should not be systematically correlated with time
Code
merged_results=merged_results[!is.na(fitted_GAMre)]

p_all_sites <- ggplot(merged_results, aes(x = year, y = agb_growth - fitted_GAMre)) +
  geom_point(alpha = 0.6) +
  facet_wrap(~ site, scales = "free_y") +  # separate y-axis per site
  ggtitle("GAM residuals per site - random effects") +
  xlab("Year") +
  ylab("Residuals (agb_growth - fitted_GAMre)") +
  theme_minimal()

p_all_sites

Code
#Residuals should not be systematically correlated with time
Code
for (s in sites) {
  
  df_site <- merged_results[merged_results$site == s, ]
  
  p <- ggplot(df_site, aes(x = year, y = fitted_GAM, color = plot)) +
    geom_line(size = 1) +
    geom_point(aes(y = agb_growth), size = 2) +
    facet_wrap(~ subplot, scales = "free_y") +
    theme_bw() +
    labs(
      title = paste("GAM predicted vs observed -", s),
      x = "Year",
      y = "agb_growth"
    ) +
    theme(legend.position = "none")
  
  print(p)  # This prints the plot in Quarto
}

Code
for (s in sites) {
  
  df_site <- merged_results[merged_results$site == s, ]
  
  p <- ggplot(df_site, aes(x = year, y = fitted_GAMre, color = subplot)) +
    geom_line(size = 1) +
    geom_point(aes(y = agb_growth), size = 2) +
    facet_wrap(~ subplot, scales = "free_y") +
    theme_bw() +
    labs(
      title = paste("GAM predicted vs observed re-", s),
      x = "Year",
      y = "agb_growth"
    ) +
    theme(legend.position = "none")
  
  print(p)  # This prints the plot in Quarto
}

GAM at edges - higher standard error, GAM uses many basis functions (Thin Plate Regression Splines (TPRS)) combined to estimate an unknown curve. Instead of minimizing just the residual error the model penalize large curvatures, which can be demonstrated with the standard errors. (for bs=fs, penalty is Shared among the subplots – it is supposed to control for overfitting) **

Error per subplot per site

Code
for (s in sites) {
  
  m_site <- merged_results[merged_results$site == s, ]
  
p=ggplot(m_site, aes(year, agb_growth-fitted_GAM)) +
geom_point(alpha = 0.3) +
facet_wrap(~ subplot) +
geom_hline(yintercept = 0, linetype = 2) +
labs(
 title = paste("Directional GAM errors by subplot-", s),
 y = "Residual")
  print(p)  # This prints the plot in Quarto
}

root mean square error

Code
rmse_results <- merged_results %>%
  group_by(site, subplot) %>%
  summarise(
    RMSE_GAM   = sqrt(mean((agb_growth - fitted_GAM)^2, na.rm = TRUE)),
    RMSE_GAMre   = sqrt(mean((agb_growth - fitted_GAMre)^2, na.rm = TRUE)),
  ) %>% 
  ungroup()


rmse_long <- rmse_results %>%
  pivot_longer(
    cols = starts_with("RMSE"),
    names_to = "Model",
    values_to = "RMSE"
  )

rmse_long <- rmse_long %>%
  mutate(Model = reorder(Model, RMSE))

ggplot(rmse_long, aes(x = Model, y = RMSE)) +
  geom_boxplot() +
  theme_bw() +
  facet_wrap(~site)

Code
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10)
  )
<theme> List of 1
 $ axis.text.x: <ggplot2::element_text>
  ..@ family       : NULL
  ..@ face         : NULL
  ..@ italic       : chr NA
  ..@ fontweight   : num NA
  ..@ fontwidth    : num NA
  ..@ colour       : NULL
  ..@ size         : num 10
  ..@ hjust        : num 1
  ..@ vjust        : NULL
  ..@ angle        : num 45
  ..@ lineheight   : NULL
  ..@ margin       : NULL
  ..@ debug        : NULL
  ..@ inherit.blank: logi FALSE
 @ complete: logi FALSE
 @ validate: logi TRUE

Testing full models

Code
#normality?

for(s in sites){
  
    cat("Processing", s, "\n")

    merged_site <- merged[site == s]

  print(hist(merged_site$agb_growth))
  
  }
Processing Corinto 

$breaks
 [1]  0  2  4  6  8 10 12 14 16 18

$counts
[1]  2  7 12 29 23 16  2  4  1

$density
[1] 0.010416667 0.036458333 0.062500000 0.151041667 0.119791667 0.083333333
[7] 0.010416667 0.020833333 0.005208333

$mids
[1]  1  3  5  7  9 11 13 15 17

$xname
[1] "merged_site$agb_growth"

$equidist
[1] TRUE

attr(,"class")
[1] "histogram"
Processing Jari 

$breaks
 [1] 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0 6.5

$counts
[1]  2 12 25 42 58 65 37 20  5

$density
[1] 0.01503759 0.09022556 0.18796992 0.31578947 0.43609023 0.48872180 0.27819549
[8] 0.15037594 0.03759398

$mids
[1] 2.25 2.75 3.25 3.75 4.25 4.75 5.25 5.75 6.25

$xname
[1] "merged_site$agb_growth"

$equidist
[1] TRUE

attr(,"class")
[1] "histogram"
Processing Lesong 

$breaks
 [1]  0  1  2  3  4  5  6  7  8  9 10 11

$counts
 [1]  1  3  4 10 14 26 23 21 17  8  5

$density
 [1] 0.007575758 0.022727273 0.030303030 0.075757576 0.106060606 0.196969697
 [7] 0.174242424 0.159090909 0.128787879 0.060606061 0.037878788

$mids
 [1]  0.5  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5

$xname
[1] "merged_site$agb_growth"

$equidist
[1] TRUE

attr(,"class")
[1] "histogram"
Processing Mbaiki 

$breaks
 [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19

$counts
 [1]   1  11  57 132 176 233 168 164  92  75  43  22  32  14  12   6   1   1

$density
 [1] 0.0008064516 0.0088709677 0.0459677419 0.1064516129 0.1419354839
 [6] 0.1879032258 0.1354838710 0.1322580645 0.0741935484 0.0604838710
[11] 0.0346774194 0.0177419355 0.0258064516 0.0112903226 0.0096774194
[16] 0.0048387097 0.0008064516 0.0008064516

$mids
 [1]  1.5  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5 12.5 13.5 14.5 15.5
[16] 16.5 17.5 18.5

$xname
[1] "merged_site$agb_growth"

$equidist
[1] TRUE

attr(,"class")
[1] "histogram"
Processing Misiones 

$breaks
[1] 1 2 3 4 5 6 7 8

$counts
[1] 14 44 41 17  6  2  2

$density
[1] 0.11111111 0.34920635 0.32539683 0.13492063 0.04761905 0.01587302 0.01587302

$mids
[1] 1.5 2.5 3.5 4.5 5.5 6.5 7.5

$xname
[1] "merged_site$agb_growth"

$equidist
[1] TRUE

attr(,"class")
[1] "histogram"
Processing Paracou 

$breaks
 [1]  1.5  2.0  2.5  3.0  3.5  4.0  4.5  5.0  5.5  6.0  6.5  7.0  7.5  8.0  8.5
[16]  9.0  9.5 10.0

$counts
 [1]   3  10  43 125 197 200 272 209 163 115  64  37  35   8   8   5   2

$density
 [1] 0.004010695 0.013368984 0.057486631 0.167112299 0.263368984 0.267379679
 [7] 0.363636364 0.279411765 0.217914439 0.153743316 0.085561497 0.049465241
[13] 0.046791444 0.010695187 0.010695187 0.006684492 0.002673797

$mids
 [1] 1.75 2.25 2.75 3.25 3.75 4.25 4.75 5.25 5.75 6.25 6.75 7.25 7.75 8.25 8.75
[16] 9.25 9.75

$xname
[1] "merged_site$agb_growth"

$equidist
[1] TRUE

attr(,"class")
[1] "histogram"
Processing SUAS 

$breaks
 [1]  4  6  8 10 12 14 16 18 20 22 24 26 28

$counts
 [1]  8 53 60 67 39 18  3  4  4  1  1  2

$density
 [1] 0.015384615 0.101923077 0.115384615 0.128846154 0.075000000 0.034615385
 [7] 0.005769231 0.007692308 0.007692308 0.001923077 0.001923077 0.003846154

$mids
 [1]  5  7  9 11 13 15 17 19 21 23 25 27

$xname
[1] "merged_site$agb_growth"

$equidist
[1] TRUE

attr(,"class")
[1] "histogram"
Processing Sungai Lalang 

$breaks
 [1]  0  2  4  6  8 10 12 14 16 18 20 22 24

$counts
 [1]  1  7 17 27 16  9  5  2  3  1  1  1

$density
 [1] 0.005555556 0.038888889 0.094444444 0.150000000 0.088888889 0.050000000
 [7] 0.027777778 0.011111111 0.016666667 0.005555556 0.005555556 0.005555556

$mids
 [1]  1  3  5  7  9 11 13 15 17 19 21 23

$xname
[1] "merged_site$agb_growth"

$equidist
[1] TRUE

attr(,"class")
[1] "histogram"
Processing Ulu Muda 

$breaks
 [1]  2  3  4  5  6  7  8  9 10 11 12

$counts
 [1]  3  6 13 21 24 18  7  2  3  2

$density
 [1] 0.03030303 0.06060606 0.13131313 0.21212121 0.24242424 0.18181818
 [7] 0.07070707 0.02020202 0.03030303 0.02020202

$mids
 [1]  2.5  3.5  4.5  5.5  6.5  7.5  8.5  9.5 10.5 11.5

$xname
[1] "merged_site$agb_growth"

$equidist
[1] TRUE

attr(,"class")
[1] "histogram"
Code
table_results_combined <- list()

for (s in sites) {
  merged_site <- merged[site == s]
  
  # choose k based on unique years
  k_use <- ifelse(uniqueN(merged_site$year) <= 20, 3, 5)
  
  cat("Site:", s, "- unique years:", uniqueN(merged_site$year), "- k used:", k_use, "\n")

  # fit GAM 1
  mod <- gam(
    agb_growth ~ subplot +
      s(year, by = subplot, k = k_use) + mean_z_tmax:treatment,
    data = merged_site,
    method = "REML",
    na.action = na.omit
  )
  
  # parametric terms
  param_tab <- broom::tidy(mod, parametric = TRUE) %>% mutate(Term_type = "parametric")
  
  # smooth terms
  smooth_tab <- broom::tidy(mod, parametric = FALSE) %>% mutate(Term_type = "smooth")
  
  # combine parametric + smooth for this site
  tab_mod    <- bind_rows(param_tab, smooth_tab) %>%
                  mutate(Model_type = "simple_GAM",
                         Model = s)
  # fit GAM 2
  mod2 <- gam(
    agb_growth ~ 
      s(year, k = k_use) +
      s(subplot, bs="re")+
      s(subplot, year, bs="re")+
      mean_z_tmax:treatment,
    data = merged_site,
    method = "REML",
    na.action = na.omit
  )
      
  param_tab2  <- broom::tidy(mod2, parametric = TRUE) %>% mutate(Term_type = "parametric")
  smooth_tab2 <- broom::tidy(mod2, parametric = FALSE) %>% mutate(Term_type = "smooth")
  tab_mod2    <- bind_rows(param_tab2, smooth_tab2) %>%
                    mutate(Model_type = "GAM_re",
                           Model = s)
  
  # fit lm
  mod3 <- lme4::lmer(
    agb_growth ~ year+
      mean_z_tmax:treatment +(year | subplot),#equivalent of s(subplot, bs="re") and s(subplot, year, bs="re"), one intercept and slope per subplot
    data = merged_site,
    REML =TRUE,
    na.action = na.omit
  )
      
tab_mod3 <- broom.mixed::tidy(
  mod3,
  effects = "fixed"
) %>%
  mutate(
    Term_type  = "parametric",
    Model_type = "LMM",
    Model      = s
  )  


  
  # ---- combine both models for this site ----
  table_results_combined[[s]] <- bind_rows(tab_mod, tab_mod2, tab_mod3)
}
Site: Corinto - unique years: 11 - k used: 3 
Site: Jari - unique years: 7 - k used: 3 
Site: Lesong - unique years: 12 - k used: 3 
Site: Mbaiki - unique years: 31 - k used: 5 
Site: Misiones - unique years: 8 - k used: 3 
Site: Paracou - unique years: 31 - k used: 5 
Site: SUAS - unique years: 13 - k used: 3 
Site: Sungai Lalang - unique years: 11 - k used: 3 
Site: Ulu Muda - unique years: 12 - k used: 3 
Code
# combine all sites into one table
final_table_combined <- bind_rows(table_results_combined) %>%
  # optional: format estimates & stats nicely
  mutate(
    Estimate  = ifelse(Term_type == "parametric", round(estimate, 3), NA),
    SE        = ifelse(Term_type == "parametric", round(std.error, 3), NA),
    Statistic = ifelse(Term_type == "smooth", round(statistic, 3), NA),
    p_value   = format.pval(p.value, digits = 3)
  ) %>%
  select(Model, Model_type, term, Term_type, Estimate, SE, Statistic, p_value)

final_table_combined <- final_table_combined %>%
  filter(grepl("treatment", term))


for (s in sites) {
  
  # keep only treatment parametric terms for this site
  m_site <- final_table_combined %>%
    filter(Model == s, Term_type == "parametric", grepl("treatment", term)) %>%
    mutate(Term_clean = gsub("mean_z_tmax:treatment", "", term))  # control / disturbed
  
  p <- ggplot(m_site,
              aes(x = Estimate,
                  y = Model_type,   # model type on y-axis
                  color = Term_clean)) +  # treatment (control/disturbed) as color
    geom_point(size = 3, position = position_dodge(width = 0.6)) +
    geom_errorbarh(aes(xmin = Estimate - SE * 1.96,
                       xmax = Estimate + SE * 1.96),
                   height = 0.3, position = position_dodge(width = 0.6)) +
    geom_vline(xintercept = 0, linetype = "dashed") +
    labs(x = "Treatment effect (±95% CI)",
         y = "Model type",
         title = paste("Treatment effect -", s),
         color = "Treatment") +
    theme_minimal()
  
  print(p)
}